*******************************************************************************
**
** Construct main hospital-year panel data set 
** Inputs: the cleaned AHA file, case mix index files, provider of services files, and various hospital ID crosswalks 
** Output: acq_cleaned_complete_<date>.dta 
**
*******************************************************************************
 

capture log close
clear all 

* locals and directories 
local fpath_main = "/disk/agedisk4/medicare.work/sacarny-DUA51934/shruthi-dua51934/replication_files/build/"

local fpath_crosswalks = "/disk/agedisk4/medicare.work/sacarny-DUA51934/shruthi-dua51934/replication_files/build/crosswalks"

local fpath_hcris = "/homes/nber/shruthi-dua51934/sacarny-DUA51934/shruthi-dua51934/replication_files/build/hcris/output"

local fpath_casemix = "/disk/agedisk4/medicare.work/sacarny-DUA51934/shruthi-dua51934/replication_files/build/casemix/output"

local fpath_aha = "/disk/agedisk4/medicare.work/sacarny-DUA51934/shruthi-dua51934/replication_files/build/aha/output"

local fpath_pos = "/disk/agedisk4/medicare.work/sacarny-DUA51934/shruthi-dua51934/replication_files/build/pos/data"


local fpath_output = "/disk/agedisk4/medicare.work/sacarny-DUA51934/shruthi-dua51934/replication_files/build/output"

log using "`fpath_main'/log_runbuild.log", replace 

* start and end years for the data
global STARTYEAR = 2003
global ENDYEAR = 2014

* output file name 
local output = "acq_cleaned_complete_20230606"

* keep hospitals that are present in the first and last years of the sample?
global KEEP_FIRST_LAST_YEAR = 1




****** bring in the HCRIS files 
use "`fpath_hcris'/hcris_merged_hospyear", clear 

* keep if year <= 2014
keep if year <= 2014

* bring in case mix index from case mix files: see process_casemix.do 
merge 1:1 pn year using "`fpath_casemix'/casemix_processed.dta", keep( match) nogen 

* merge m:1 to the POS file 
merge m:1 pn year using "`fpath_pos'/pos.dta", keep(master match) nogen 
keep if termcode == 0 
drop if missing(termcode)

* save to a tempfile 
tempfile pnyr
save pnyr, replace
 
******* bring in the AHA data
** crosswalk for AHA to PN 
use "`fpath_crosswalks'/aha_pn_xwalk_all.dta", clear 
drop if pn == "777777"

* expand the xw to the number of years in the data
expand $ENDYEAR - $STARTYEAR + 1 

* gen year sequence 

bys id pn: egen year = seq(), from($STARTYEAR) to($ENDYEAR) block(1)

* merge m:1 to the cms file (with hcris and provider of services info) 
* if a pn was EVER associated with an aha id, try to find its record in the 
* hcompare/ hcris data 
merge m:1 pn year using pnyr, keep(match) nogen
 
* variables to take a weighted sum or average over PN 
local beds "active beds_adultped_wtd beds_totadultped_wtd beds_total_min beds_total_max beds_total_wtd availbeddays_adultped ipbeddays_adultped ipdischarges_adultped"
local flows "uccare_chg_harmonized ipcharge opcharge ppamount totampayable totdischarge medicare_discharge uccare_cost_harmonized netpatrev othinc opexp othexp donations invinc iphosprev ipgenrev ipicrev iprcrev ipancrev ipoprev iptotrev opancrev opoprev optotrev tottotrev chguccare totinitchcare ppaychcare nonmcbaddebt costuccare_v2010 nreports nfmt96 nfmt10 nno_uncomp beds_tot income totcost margin"
*local sumvars "`beds'"

* other variables to take a weighted average over PN 
local other_avgvars "cah urbancbsa casemixindex frac_year_covered ccr_min ccr_max ccr_wtd cost_* capinv_tot"
*"cah urbancbsa casemixindex frac_year_covered ccr_min ccr_max ccr_wtd cost_* capinv_tot gen_*"

* collapse to id-year level  
collapse (mean) `other_avgvars' `beds' `flows' (max) termcode [aw = beds_tot], by(id year)

* merge to AHA files (see load_aha_data.do)
merge 1:1 id year using "`fpath_aha'/complete_aha.dta", gen(aha_merge) keep(match)
*tempfile idyr
save idyr, replace 
 
 
****** collapse to synthetic id-year
* load the AHA ID to synthetic ID xwalk 
use "`fpath_crosswalks'/treat_aha_xwalk.dta", clear

* clean up the names 
rename id_aha synth_id
label var synth_id "Synthetic AHA ID" 
rename id_aha_indiv id 
label var id "Individual AHA ID"

* make sure all AHA ID variables are string 
tostring synth_id id , replace 
foreach v in synth_id id {
	gen t = length(`v')
	replace `v' = "0" + `v' if t < 7
	drop t 
}

* create a synthetic 2003 observation (replicate 2004)
expand 2 if year == 2004, gen(r)
replace year = 2003 if r == 1
drop r 

* merge to the AHA / PN files above 
merge m:1 id year using idyr, gen(xwalk_merge) 

* replace the ID for synthetic ID where relevant 
replace id = synth_id if !missing(synth_id)

* clean up the treatment group indicators 
rename legacy_acq acq_legacy 
rename other acq_other 


* collapse to synthetic ID - year level 
duplicates drop 
local maxvars "target acq_legacy target2 acq_other synthetic divested_yr divested_during_analysis divested_after_analysis divested_ever acquired_yr acquired_during_analysis hrrcode"
collapse (mean) `other_avgvars' `beds' `flows' hospbd cntrl admtot fte (max) `maxvars'   [aw = hospbd], by(id year)
label var id "Synthetic AHA ID"

* data check 
egen tempvar =  rowtotal(target acq_legacy target2 acq_other )
assert tempvar == 1 if acquired_yr > 0 & !missing(acquired_yr)
drop tempvar 

* save as a tempfile 
tempfile synthidyr
save `synthidyr', replace 

****** bring in the synthetic PNS from the xwalk 
* load the PN to synthetic ID xwalk 
use "`fpath_crosswalks'/acq_aha_pn_xwalk_byyr.dta", clear

* rename and clean up 
*rename id_aha id 
label var id "Synthetic AHA ID"

expand 2 if year == 2004, gen(r)
replace year = 2003 if r == 1
drop r 


* make sure PN and ID variables are string 
tostring id pn, replace 
foreach v in id pn { 
	gen t = length(`v')
	egen m = max(t)
	replace `v' ="0" + `v' if t < m 
	drop t m 
}

* merge with the synthetic id - year file 
keep id year pn 
duplicates drop 
sort id year 
merge m:1 id year using `synthidyr', keep(using match)

* if missing PN, get it from the AHA files 
merge 1:1 id year using "`fpath_aha'/complete_aha.dta", keepusing(mcrnum) keep(master match) nogen
replace pn = mcrnum if missing(pn)
drop if missing(pn)






******** clean up 
* income per bed 
gen incomepbed = income / (1000 * hospbd) if !missing(income)

* cost per bed 
gen costpbed = totcost / (1000 * hospbd) if !missing(totcost)

* cost of uncomp care 
gen costuccare_v2010_pbed = costuccare_v2010 / (1000 * hospbd) if !missing(costuccare_v2010)

gen lhospbd = log(hospbd)
label var lhospbd "Log(beds)"

* indicators for acquirer hospitals' acquisitions 
gen acqhosp = (acquired_yr > 0) & !missing(acquired_yr)

* gen profit and operating margins 
gen profit_margin = (income-totcost)/income if !missing(income) & !missing(totcost)
gen op_inc = income - donations - invinc if !missing(income) & !missing(totcost)
gen op_margin = (op_inc - totcost)/op_inc if !missing(income) & !missing(totcost)
gen nonacq_forprofit = inrange(cntrl, 30, 33) & !missing(cntrl) & !acqhosp
gen forprofit = inrange(cntrl, 30, 33) & !missing(cntrl)
replace forprofit = 1 if (acq_legacy == 1) | (target == 1) | (target2 == 1) | (acq_other == 1)

* gen variables used to plot by acquisition group  
gen acq_group = "Acq Legacy" if acq_legacy == 1 & acquired_yr > 0 
replace acq_group = "Target 2" if target2 == 1 & acquired_yr > 0 
replace acq_group = "Target" if target == 1 & acquired_yr > 0 
replace acq_group = "Acq Other" if acq_other == 1 & acquired_yr > 0 
gen order = 1 if acq_group == "Acq Legacy"
replace order = 2 if acq_group == "Target 2" 
replace order = 3 if acq_group == "Target" 
replace order = 4 if acq_group == "Acq Other"
label define orderlab 1 "Acq Legacy" 2 "Target 2" 3 "Target" 4 "Acq Other" 
label values order orderlab 


* label variables for tables
label var beds_tot "Total beds from POS file" 
label var admtot "Total hospital admissions" 
label var costuccare_v2010_pbed "Cost of uncomp care (\$1000s/ bed)" 
label var incomepbed "Income (\$1000s/ bed)" 
label var costpbed "Cost ($1000s/ bed)"
label var profit_margin "Profit margin"


if $KEEP_FIRST_LAST_YEAR {
	* keep the hospitals that show up in the first (2003) and last years
	preserve 
	collapse (min) min_year=year (max) max_year=year, by(id)
	drop if min_year!=$STARTYEAR
	drop if max_year!=$ENDYEAR
	tempfile idlist
	keep id 
	save `idlist'
	restore
	merge m:1 id using `idlist', keep(match) nogen 
}

* generate three year group indicators 
gen ind03_06 = inrange(year, 2003, 2006)
gen ind07 = year == 2007
gen ind08_11 = inrange(year, 2008, 2011) 
gen ind12_14 = inrange(year, 2012, 2014)




* replace missing acquisition group values as 0 
foreach var in acq_legacy target target2 acq_other {
	replace `var' = 0 if missing(`var')
}

* gen group for group*yearind interactions 
* because rifreg can't handle interactions 
foreach ind in ind03_06 ind07 ind08_11 ind12_14 {
	
	local name = subinstr(subinstr("`ind'", "_", "-", 1), "ind", "", 1)
	
	foreach gr in acq_legacy target target2 acq_other{
		
		gen `ind'`gr' = `ind' * `gr' 
		
		if "`gr'" == "acq_legacy" {
			local cat "Acq Legacy" 
		}
		if "`gr'" == "target2" {
			local cat "Target 2"
		}
		if "`gr'" == "target" {
			local cat "Target"
		}
		if "`gr'" == "acq_other" {
			local cat "Other Acq" 
		}
		
		label var `ind'`gr' "`cat' * `name'"
	
	}

} 

* Dafny price index 
gen discount = netpatrev/ tottotrev
gen dafny_price = (iprcrev + ipancrev) * discount - totampayable - ppamount
label var dafny_price "Dafny price index"
replace dafny_price = dafny_price / ((totdischarge - medicare_discharge) * casemixindex)
replace dafny_price = dafny_price / 1000
*count if dafny_price < 0 

* there is one hospital with multiple PNs and AHA IDs 
drop if pn == "420082"


* drop immediately divested hospitals 
preserve 
import excel "`fpath_crosswalks'/Acq AHA Xwalk (Dalton).xlsx", sheet("combined (stata input)") firstrow clear
keep if (divested_yr <= 2008) | (divested_yr == 2015 & target2 == 1)
rename id_aha_dalton id
keep id 
tempfile drop_divested
save `drop_divested', replace
restore 

cap drop _merge 
merge m:1 id using `drop_divested', keep(master) nogen



* save 
sort id year 
cap drop _merge 
save "`fpath_output'/`output'", replace




log close 
















